C. Anderson, provocatively, in Wired magazine:
“The End of Theory: the data deluge makes the scientific method obsolete”
J. Epstein (2008, JASSS): ‘Why model?’
A model is a simplified representation/abstraction of a target system, which implements some theoretical propositions about the logical linkages between objects of interest.
In this tutorial, two kinds of models are presented:
More generally, we want to show how the two shed complementary light on spatial problems and how they interact with new, massive datasets.
Let’s bind together the data for all three cities
# Read every listing file found in the Inside Airbnb folder.
# The first, second and third files (in list.files() order) are taken to be
# Montreal, Toronto and Vancouver respectively -- presumably one CSV per city;
# verify against the folder contents.
airbnb_dir <- 'insideairbnb/'
f <- list.files(airbnb_dir)
listings <- lapply(paste0(airbnb_dir, f[1:3]), read.csv)
mont <- listings[[1]]
toro <- listings[[2]]
vanc <- listings[[3]]

# Distinct values of the `city` column found in each file.
montreal <- unique(mont$city)
toronto <- unique(toro$city)
vancouver <- unique(vanc$city)

# Stack the three cities into a single table.
df <- do.call(rbind, listings)
Cleaning here means deciding how to bring the data at hand in line with the purpose of the analysis.
First, if we want to take Airbnb listings as proxies for residents, we need to identify and remove commercial lettings, listings located in a neighbourhood different from the host’s own neighbourhood, and additional listings from hosts with multiple properties.
# Classify each Airbnb property type as 'home', 'hotel' or 'other'.
# The mapping is keyed by the property-type NAME instead of by the position of
# the type in levels(df$property_type). It therefore no longer depends on the
# order of the factor levels, and it also works when read.csv() returns
# character columns (the default since R 4.0), in which case levels() is NULL.
property_groups <- c(
  'Aparthotel' = 'hotel', 'Apartment' = 'home',
  'Bed and breakfast' = 'hotel', 'Boat' = 'other',
  'Boutique hotel' = 'hotel', 'Bungalow' = 'home', 'Cabin' = 'other',
  'Camper/RV' = 'other', 'Campsite' = 'other',
  'Casa particular (Cuba)' = 'home', 'Cave' = 'other', 'Chalet' = 'home',
  'Condominium' = 'home', 'Cottage' = 'home',
  'Farm stay' = 'home', 'Guest suite' = 'home', 'Guesthouse' = 'hotel',
  'Hostel' = 'hotel', 'Hotel' = 'hotel', 'House' = 'home',
  'Houseboat' = 'home',
  'Hut' = 'other', 'Loft' = 'home', 'Nature lodge' = 'other',
  'Other' = 'other', 'Serviced apartment' = 'hotel', 'Tent' = 'other',
  'Timeshare' = 'other',
  'Tiny house' = 'home', 'Townhouse' = 'home', 'Villa' = 'home',
  'Barn' = 'home', 'Castle' = 'home', 'Dorm' = 'hotel',
  'Earth house' = 'other',
  'In-law' = 'home', 'Parking Space' = 'other', 'Treehouse' = 'other',
  'Resort' = 'hotel'
)

# Same two-column lookup table as before: `type` (factor) and
# `property_group` (character).
l <- names(property_groups)
lookup <- data.frame(
  type = factor(l),
  property_group = unname(property_groups),
  stringsAsFactors = FALSE
)

# Types absent from the mapping get an NA group and are silently dropped by
# the subset() below, so report them instead of losing them unnoticed.
unmapped <- setdiff(as.character(unique(df$property_type)), c(l, NA))
if (length(unmapped) > 0) {
  warning(
    "Property types without a property_group: ",
    paste(unmapped, collapse = ", "),
    call. = FALSE
  )
}

# Append the `type` and `property_group` columns to every listing.
df <- data.frame(df, lookup[match(df$property_type, lookup$type), ])

# Keep homes whose host declares the same neighbourhood as the listing.
# Rows where either neighbourhood is NA are dropped by subset().
# dfh = subset(df, property_group == 'home' & as.character(df$host_neighbourhood) == as.character(df$neighbourhood) & df$room_type != "Shared room")
dfh <- subset(
  df,
  property_group == 'home' &
    as.character(host_neighbourhood) == as.character(neighbourhood)
)
dfh$property_group <- NULL

# Keep only the first listing of each host.
dfhu <- dfh[!duplicated(dfh$host_id), ]
dim(df)
## [1] 43211 98
dim(dfhu)
## [1] 24404 97
Then we want to keep only current listings, i.e. those whose last review dates back less than two years, for example.
# Year of the last review: the first four characters of `last_review`,
# converted to a number (NA when they cannot be parsed).
review_year <- substring(as.character(dfhu$last_review), 1, 4)
dfhu$year <- as.numeric(review_year)

# Optional exploratory map of listings coloured by last-review year
# (left disabled):
# pal <- colorFactor(
# palette = brewer.pal(n=10, 'Blues'),
# domain = dfhu$year
# )
# leaflet() %>% addProviderTiles("CartoDB.Positron") %>%
# addCircleMarkers(
# data = dfhu,
# radius = ~ sqrt(0.07 * numPrice),
# lat = ~ latitude,
# color = ~pal(year),
# stroke = FALSE,
# fillOpacity = 0.5,
# layerId = ~ id,
# lng = ~ longitude
# ) %>% addLegend(pal = pal, position = 'topleft', values = dfhu$year)
#

# Keep listings last reviewed in 2017 or later; rows with an NA year are
# excluded.
dfhun <- dfhu[which(dfhu$year >= 2017), ]

# Number of listings before and after the cleaning steps.
nrow(df)
## [1] 43211
nrow(dfhun)
## [1] 16905
Then, we want a price variable that is usable (i.e. numeric), and we want to normalise the price by a measure of size: the number of rooms, because the square footage is mostly left blank by hosts. This implies removing the small number of shared rooms.
# Numeric nightly price. Both the dollar sign and the thousands separator are
# stripped: with only "$" removed, a price such as "$1,200.00" cannot be
# parsed and becomes NA (with a coercion warning).
dfhun$numPrice <- as.numeric(gsub("[$,]", "", dfhun$price))

# Listings per room type; as.factor() keeps the tabulated output when
# `room_type` was read as character rather than factor.
summary(as.factor(dfhun$room_type))
## Entire home/apt Private room Shared room
## 12163 4592 150

# Shared rooms cannot be normalised by a number of rooms, so drop them.
final <- subset(dfhun, room_type != "Shared room")

# Listings reporting zero bedrooms are counted as one room.
final$rooms <- ifelse(final$bedrooms == 0, 1, final$bedrooms)

# Price per room. A private-room listing already prices a single room, so its
# price is used as is; other listings are divided by their room count.
# The previous test `room_type == T` compared the label with "TRUE" and was
# never true, so every listing was divided by `rooms`.
final$priceperroom <- as.numeric(
  ifelse(
    final$room_type == "Private room",
    final$numPrice,
    final$numPrice / final$rooms
  )
)
Let’s retrieve census shape files and data from the census API
and map!
# Decile colour scale for the price per room.
# The nine 'Blues' colours are passed explicitly: asking for the palette by
# name with n = 10 makes RColorBrewer warn ("n too large, allowed maximum for
# palette Blues is 9") each time the palette is evaluated. leaflet
# interpolates the same nine colours either way, so the colours are unchanged.
pal <- colorQuantile(
  palette = RColorBrewer::brewer.pal(9, 'Blues'),
  domain = city_data$priceperroom,
  n = 10
)

# Boundary outlines (black and grey, unfilled) under one circle per listing:
# circle radius grows with the number of rooms, fill colour follows the
# price-per-room decile.
# NOTE(review): `csd.csd.geo`, `csd.geo` and `city_data` are defined outside
# this section -- confirm that the two boundary object names are intended.
map <- leaflet() %>%
  addProviderTiles("CartoDB.Positron") %>%
  addPolygons(
    data = csd.csd.geo,
    color = 'black',
    fill = FALSE,
    weight = 0.7,
    opacity = 0.9
  ) %>%
  addPolygons(
    data = csd.geo,
    color = 'grey',
    fill = FALSE,
    weight = 0.4
  ) %>%
  addCircleMarkers(
    data = city_data,
    radius = ~ sqrt(4 * rooms),
    lat = ~ latitude,
    fillColor = ~ pal(priceperroom),
    color = 'black',
    stroke = TRUE,
    fillOpacity = 0.5,
    weight = 0.1,
    layerId = ~ id,
    lng = ~ longitude
  ) %>%
  addLegend(pal = pal, position = 'topleft', values = city_data$priceperroom)
map
and map!
# Decile colour scales: blue for price per room, red for the negated tract
# value `Ei`.
# The nine ColorBrewer colours are passed explicitly: requesting 'Blues' or
# 'Reds' by name with n = 10 makes RColorBrewer warn ("n too large, allowed
# maximum ... is 9") each time the palette is evaluated. leaflet interpolates
# the same nine colours either way, so the colours are unchanged.
pal <- colorQuantile(
  palette = RColorBrewer::brewer.pal(9, 'Blues'),
  domain = city_data$priceperroom,
  n = 10
)
pal2 <- colorQuantile(
  palette = RColorBrewer::brewer.pal(9, 'Reds'),
  domain = -tractTable$Ei,
  n = 10
)

# Black boundary outline under tract polygons shaded by the decile of -Ei.
# `fillColor` now carries the shading: `fill` is an on/off switch, and the
# colours previously passed to it only worked because the fill colour
# defaults to the stroke colour.
# NOTE(review): `csd.csd.geo`, `tractTable` and `city_data` are defined
# outside this section.
map <- leaflet() %>%
  addProviderTiles("CartoDB.Positron") %>%
  addPolygons(
    data = csd.csd.geo,
    color = 'black',
    fill = FALSE,
    weight = 0.7,
    opacity = 0.9
  ) %>%
  addPolygons(
    data = tractTable,
    color = ~ pal2(-Ei),
    fillColor = ~ pal2(-Ei),
    weight = 0.4
  ) %>%
  # Listing markers and their legend are disabled on this map:
  #addCircleMarkers(
  # data = city_data,
  # radius = ~ sqrt(4 * rooms),
  # lat = ~ latitude,
  # fillColor = ~ pal(priceperroom),
  # color = 'black',
  # stroke = T,
  # fillOpacity = 0.5,
  # weight = 0.1,
  # layerId = ~ id,
  # lng = ~ longitude
  # ) %>%
  # addLegend(pal = pal, position = 'topleft', values = city_data$priceperroom)%>%
  addLegend(pal = pal2, position = 'topleft', values = -tractTable$Ei)
map
Let’s use information theory to quantify the diversity and segregation of a given city (cf. John Iceland et al. on multigroup entropy: https://www.census.gov/hhes/www/housing/resseg/multigroup_entropy.pdf). The measures that have been implemented are described below and tested on Canadian metropolises from the cancensus package (example by @dshkol: https://github.com/dshkol/scratchpad/blob/master/content/post/2018-05-10-diversity-and-segregation-i.Rmd).